#Housekeeping

date()
## [1] "Tue Nov  8 15:27:58 2022"
Sys.time()
## [1] "2022-11-08 15:27:58 CST"
getwd()
## [1] "/Users/arihangupta/Downloads/PhiCB5_FISH_Analysis"
list.files()
## [1] "10-05.fish.Rmd"              "Data"                       
## [3] "EDA_RNA_fish_data_split.Rmd" "EDA_RNAfish_ilastik.Rmd"    
## [5] "Output"                      "PhiCB5_FISH_Analysis.Rproj" 
## [7] "README.md"
.libPaths()
## [1] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library"
.Library
## [1] "/Library/Frameworks/R.framework/Resources/library"
sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: aarch64-apple-darwin20 (64-bit)
## Running under: macOS Monterey 12.3.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] digest_0.6.29   R6_2.5.1        jsonlite_1.8.0  magrittr_2.0.3 
##  [5] evaluate_0.15   stringi_1.7.6   rlang_1.0.6     cli_3.4.1      
##  [9] rstudioapi_0.13 jquerylib_0.1.4 bslib_0.3.1     rmarkdown_2.14 
## [13] tools_4.2.0     stringr_1.4.0   xfun_0.31       yaml_2.3.5     
## [17] fastmap_1.1.0   compiler_4.2.0  htmltools_0.5.2 knitr_1.39     
## [21] sass_0.4.1
search()
## [1] ".GlobalEnv"        "package:stats"     "package:graphics" 
## [4] "package:grDevices" "package:utils"     "package:datasets" 
## [7] "package:methods"   "Autoloads"         "package:base"
searchpaths()
## [1] ".GlobalEnv"                                                                    
## [2] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/stats"    
## [3] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/graphics" 
## [4] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/grDevices"
## [5] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/utils"    
## [6] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/datasets" 
## [7] "/Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library/methods"  
## [8] "Autoloads"                                                                     
## [9] "/Library/Frameworks/R.framework/Resources/library/base"
R.Version()
## $platform
## [1] "aarch64-apple-darwin20"
## 
## $arch
## [1] "aarch64"
## 
## $os
## [1] "darwin20"
## 
## $system
## [1] "aarch64, darwin20"
## 
## $status
## [1] ""
## 
## $major
## [1] "4"
## 
## $minor
## [1] "2.0"
## 
## $year
## [1] "2022"
## 
## $month
## [1] "04"
## 
## $day
## [1] "22"
## 
## $`svn rev`
## [1] "82229"
## 
## $language
## [1] "R"
## 
## $version.string
## [1] "R version 4.2.0 (2022-04-22)"
## 
## $nickname
## [1] "Vigorous Calisthenics"

Checking and installing packages

packages <- c("ggplot2", "dplyr", "tidyr", "readxl", "ggpubr", "skimr", "DataExplorer", "tidyverse", "skimr", "svglite", "readxl","tidyxl", "ggforce", "ggpubr", "ggsci", "ggthemes", "ragg", "gplots", "devtools", "readr")

### for this data we need "Biobase", "org.HS.eg.db", "AnnotationDbi" ... but it says these aren't available for this version of R -- they have probably been updated, generally theres an online central database for R and bioconductor especially bioinformatics. 


# install.packages(setdiff(packages, rownames(installed.packages()))) 
# 
# update.packages (packages) 

for (i in 1:length(packages)) {

library(packages[i], character.only = TRUE)
  
  
}
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.7     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ✔ purrr   0.3.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## 
## Attaching package: 'gplots'
## 
## 
## The following object is masked from 'package:stats':
## 
##     lowess
## 
## 
## Loading required package: usethis

Data Import

data1 <- read.csv("Data/data_table.csv", header=T)
data2 <- read.csv("Data/180-data_table.csv", header=T)
data3 <- read.csv("Data/200-data_table.csv", header=T)


rawinf0 <- read.csv("Data/data_table.csv", header=T)
rawinf180 <- read.csv("Data/180-data_table.csv", header=T)
rawuninf200 <- read.csv("Data/200-data_table.csv", header=T)
means.180.1 <- read.csv("Data/180.1.csv", header=T)
means.180.2 <- read.csv("Data/180.2.csv", header=T)
means.180.3 <- read.csv("Data/180.3.csv", header=T)
means.180.4 <- read.csv("Data/180.4.csv", header=T)
means.180.5 <- read.csv("Data/180.5.csv", header=T)
meanInt180 <- mean(c(means.180.1$Mean, means.180.2$Mean, means.180.3$Mean, means.180.4$Mean, means.180.5$Mean))

means.0.1 <- read.csv("Data/0.1.csv", header=T)
means.0.2 <- read.csv("Data/0.2.csv", header=T)
means.0.3 <- read.csv("Data/0.3.csv", header=T)
means.0.4 <- read.csv("Data/0.4.csv", header=T)
means.0.5 <- read.csv("Data/0.5.csv", header=T)
meanInt0 <- mean(c(means.0.1$Mean, means.0.2$Mean, means.0.3$Mean, means.0.4$Mean, means.0.5$Mean))

means.200.1 <- read.csv("Data/200.1.csv", header=T)
means.200.2 <- read.csv("Data/200.2.csv", header=T)
means.200.3 <- read.csv("Data/200.3.csv", header=T)
means.200.4 <- read.csv("Data/200.4.csv", header=T)
means.200.5 <- read.csv("Data/200.5.csv", header=T)
meanInt200 <- mean(c(means.200.1$Mean, means.200.2$Mean, means.200.3$Mean, means.200.4$Mean, means.200.5$Mean))

looking around

ls() # show me all the data objects that are present
##  [1] "data1"       "data2"       "data3"       "i"           "meanInt0"   
##  [6] "meanInt180"  "meanInt200"  "means.0.1"   "means.0.2"   "means.0.3"  
## [11] "means.0.4"   "means.0.5"   "means.180.1" "means.180.2" "means.180.3"
## [16] "means.180.4" "means.180.5" "means.200.1" "means.200.2" "means.200.3"
## [21] "means.200.4" "means.200.5" "packages"    "rawinf0"     "rawinf180"  
## [26] "rawuninf200"
# newData1 = as.data.frame(data1$Predicted.Class)
# newData1[,1] = "0"
# newData1 <- cbind(newData1, data1$Size.in.pixels, data1$Mean.Intensity_2, data1$Maximum.intensity_2, data1$Total.Intensity_2)
# colnames(newData1) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt")
# newData1 <- cbind(newData1, ((newData1$MeanInt)/meanInt0), ((newData1$MaxInt)/meanInt0), ((newData1$SumInt)/meanInt0), ((newData1$MeanInt - meanInt0)/meanInt0))
# colnames(newData1) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt", "StandMean", "StandMax", "StandSum", "PercentIncrease")
# 
# newinf180 = as.data.frame(data2$Predicted.Class)
# newinf180[,1] = "180"
# newinf180 <- cbind(newinf180, data1$Size.in.pixels, data1$Mean.Intensity_2, data1$Maximum.intensity_2, data1$Total.Intensity_2)
# colnames(newinf180) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt")
# newinf180 <- cbind(newinf180, ((newinf180$MeanInt)/meanInt180), ((newinf180$MaxInt)/meanInt180), ((newinf180$SumInt)/meanInt0), ((newinf180$MeanInt - meanInt180)/meanInt180))
# colnames(newinf180) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt", "StandMean", "StandMax", "StandSum", "PercentIncrease")
# 
# newuninf200 = as.data.frame(data3$Predicted.Class)
# newuninf200[,1] = "200"
# newuninf200 <- cbind(newuninf200, uninf200$Size.in.pixels, uninf200$Mean.Intensity_2, uninf200$Maximum.intensity_2, uninf200$Total.Intensity_2)
# colnames(newuninf200) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt")
# newuninf200 <- cbind(newuninf200, ((newuninf200$MeanInt)/meanInt200), ((newuninf200$MaxInt)/meanInt200), ((newuninf200$SumInt)/meanInt0), ((newuninf200$MeanInt - meanInt200)/meanInt200))
# colnames(newuninf200) <- c("ID", "Size", "MeanInt", "MaxInt", "SumInt", "StandMean", "StandMax", "StandSum", "PercentIncrease")
inf0 <- select(rawinf0, 'object_id', 'Size.in.pixels', 'Object.Area',  'Variance.of.Intensity_0', 'Variance.of.Intensity_1', 'Variance.of.Intensity_2', 'Mean.Intensity_0', 'Mean.Intensity_1', 'Mean.Intensity_2', 'Maximum.intensity_0', 'Maximum.intensity_1', 'Maximum.intensity_2', 'Minimum.intensity_0', 'Minimum.intensity_1','Minimum.intensity_2', 'Total.Intensity_0', 'Total.Intensity_1','Total.Intensity_2')

inf0 <- mutate(inf0, corrected_mean_intensity_2 = Mean.Intensity_2/meanInt0, normalized_intensity_2 = (Mean.Intensity_2-meanInt0)/meanInt0, normal_Maximum.intensity_2 = Maximum.intensity_2/meanInt0, cond = (rep("infected", nrow(inf0))), time = (rep("0", nrow(inf0))), id = (rep("inf0", nrow(inf0))))

inf0 <- mutate(inf0, cond = as.factor(cond), time = as.factor(time), id = as.factor(id))

inf180 <- select(rawinf180, 'object_id', 'Size.in.pixels', 'Object.Area',  'Variance.of.Intensity_0', 'Variance.of.Intensity_1', 'Variance.of.Intensity_2', 'Mean.Intensity_0', 'Mean.Intensity_1', 'Mean.Intensity_2', 'Maximum.intensity_0', 'Maximum.intensity_1', 'Maximum.intensity_2', 'Minimum.intensity_0', 'Minimum.intensity_1','Minimum.intensity_2', 'Total.Intensity_0', 'Total.Intensity_1','Total.Intensity_2')

inf180 <- mutate(inf180, corrected_mean_intensity_2 = Mean.Intensity_2/meanInt180, normalized_intensity_2 = (Mean.Intensity_2-meanInt180)/meanInt180, normal_Maximum.intensity_2 = Maximum.intensity_2/meanInt180, cond = (rep("infected", nrow(inf180))), time = (rep("180", nrow(inf180))), id = (rep("inf180", nrow(inf180))))

inf180 <- mutate(inf180, cond = as.factor(cond), time = as.factor(time), id = as.factor(id))

uninf200 <- select(rawuninf200, 'object_id', 'Size.in.pixels', 'Object.Area',  'Variance.of.Intensity_0', 'Variance.of.Intensity_1', 'Variance.of.Intensity_2', 'Mean.Intensity_0', 'Mean.Intensity_1', 'Mean.Intensity_2', 'Maximum.intensity_0', 'Maximum.intensity_1', 'Maximum.intensity_2', 'Minimum.intensity_0', 'Minimum.intensity_1','Minimum.intensity_2', 'Total.Intensity_0', 'Total.Intensity_1','Total.Intensity_2')

uninf200 <- mutate(uninf200, corrected_mean_intensity_2 = Mean.Intensity_2/meanInt200, normalized_intensity_2 = ((Mean.Intensity_2-meanInt200)/meanInt200), normal_Maximum.intensity_2 = Maximum.intensity_2/meanInt200, cond = (rep("uninfected", nrow(uninf200))), time = (rep("200", nrow(uninf200))), id = (rep("uninf200", nrow(uninf200))))

uninf200 <- mutate(uninf200, cond = as.factor(cond), time = as.factor(time), id = as.factor(id))

data <- rbind(inf0, inf180, uninf200)

You can use tables, plots, summaries to look at your data, but introduce() function from data explorer is a great overview

see https://cran.r-project.org/web/packages/DataExplorer/vignettes/dataexplorer-intro.html#exploratory-data-analysis

for a more formal up to date review when workign with big datasets

introduce(data)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  563      24                3                 21                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           563              13512       107104
head(data)
##   object_id Size.in.pixels Object.Area Variance.of.Intensity_0
## 1         0            239         239               1478288.0
## 2         1             91          91               2231988.0
## 3         2            159         159               2616356.2
## 4         3             98          98                935724.8
## 5         4            249         249               1715539.1
## 6         5            224         224               2386803.0
##   Variance.of.Intensity_1 Variance.of.Intensity_2 Mean.Intensity_0
## 1                11997313                214094.5         8826.088
## 2                13073474                112568.1         9129.352
## 3                14619022                225386.5         8855.333
## 4                 3687060                190662.2         8815.582
## 5                26412804                134157.7         9106.727
## 6                27148668                136859.2         8867.353
##   Mean.Intensity_1 Mean.Intensity_2 Maximum.intensity_0 Maximum.intensity_1
## 1         26949.00         4400.092               13281               33802
## 2         21995.25         4153.989               12953               27626
## 3         22669.25         4349.692               12823               29080
## 4         25786.57         4429.541               11646               30352
## 5         29540.59         4318.852               13282               38405
## 6         26727.04         4162.294               12724               37138
##   Maximum.intensity_2 Minimum.intensity_0 Minimum.intensity_1
## 1                6577                6922               16460
## 2                5167                6522               13328
## 3                5971                6282               12429
## 4                5761                7479               20910
## 5                5291                6333               14575
## 6                5249                6602               14100
##   Minimum.intensity_2 Total.Intensity_0 Total.Intensity_1 Total.Intensity_2
## 1                3265           2109435           6440810           1051622
## 2                3330            830771           2001568            378013
## 3                3404           1407998           3604411            691601
## 4                3502            863927           2527084            434095
## 5                3295           2267575           7355606           1075394
## 6                3264           1986287           5986857            932354
##   corrected_mean_intensity_2 normalized_intensity_2 normal_Maximum.intensity_2
## 1                   1.205718              0.2057181                   1.802237
## 2                   1.138281              0.1382806                   1.415867
## 3                   1.191907              0.1919074                   1.636180
## 4                   1.213788              0.2137877                   1.578636
## 5                   1.183456              0.1834564                   1.449846
## 6                   1.140556              0.1405565                   1.438337
##       cond time   id
## 1 infected    0 inf0
## 2 infected    0 inf0
## 3 infected    0 inf0
## 4 infected    0 inf0
## 5 infected    0 inf0
## 6 infected    0 inf0
tail(data)
##     object_id Size.in.pixels Object.Area Variance.of.Intensity_0
## 558       220             69          69                 2347364
## 559       221             73          73                 3457264
## 560       222            130         130                 1647870
## 561       223             28          28                 1886626
## 562       224            103         103                 1794572
## 563       225             74          74                 2203610
##     Variance.of.Intensity_1 Variance.of.Intensity_2 Mean.Intensity_0
## 558                 8144874                97106.73         9209.782
## 559                11581435               101303.98         9778.425
## 560                 2082476               102143.75         9734.077
## 561                 7466740               122256.81         9816.107
## 562                 7599682               116769.94         9702.961
## 563                 5724328                89278.86         8773.459
##     Mean.Intensity_1 Mean.Intensity_2 Maximum.intensity_0 Maximum.intensity_1
## 558         20429.78         3944.246               14328               25915
## 559         20609.82         4066.137               14563               26505
## 560         16941.14         3900.262               13881               20086
## 561         20377.14         4090.893               13365               25563
## 562         18169.52         4030.942               13497               24322
## 563         17587.12         3874.811               12302               22413
##     Maximum.intensity_2 Minimum.intensity_0 Minimum.intensity_1
## 558                4628                7196               13753
## 559                4923                7383               13725
## 560                4885                7761               13263
## 561                4968                7448               15390
## 562                4974                7611               12970
## 563                4585                6345               13212
##     Minimum.intensity_2 Total.Intensity_0 Total.Intensity_1 Total.Intensity_2
## 558                3110            635475           1409655            272153
## 559                3056            713825           1504517            296828
## 560                3030           1265430           2202348            507034
## 561                3464            274851            570560            114545
## 562                3307            999405           1871461            415187
## 563                3381            649236           1301447            286736
##     corrected_mean_intensity_2 normalized_intensity_2
## 558                   1.080035             0.08003535
## 559                   1.113412             0.11341213
## 560                   1.067991             0.06799119
## 561                   1.120191             0.12019090
## 562                   1.103775             0.10377476
## 563                   1.061022             0.06102214
##     normal_Maximum.intensity_2       cond time       id
## 558                   1.267265 uninfected  200 uninf200
## 559                   1.348043 uninfected  200 uninf200
## 560                   1.337638 uninfected  200 uninf200
## 561                   1.360365 uninfected  200 uninf200
## 562                   1.362008 uninfected  200 uninf200
## 563                   1.255490 uninfected  200 uninf200
# Relevant Data to us is Size in Pixels, maximum Intensity, Minimum Intensity, Mean Intensity, Variance of Intensity. personally I am going to remake the dataframe with these parameters. 
introduce(data)
##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  563      24                3                 21                   0
##   total_missing_values complete_rows total_observations memory_usage
## 1                    0           563              13512       107104
## Sometimes real word data can be really messy, another good thing to do can be to plot the missing data as you will see below. This allows us to check the data, even though introduce() tells us numebr of missing values, we want to see where those values are, or if there are none at all. 

plot_missing(data)  # none will show up because no data is missing

# 
# table(FACTORVARIABLES, useNA = "ifany" # it can be useful to put factors in a table, none here though, include NAs to check for NA
# 
# 
# ###--- look for missing values --- #####
# 
# 
# # is.na checks for NA values
# table(is.na(GFP[1])) # use row or column numbers [] with these if you onyl want to take some columns because they need the structure of the df versus a string call using the $
# 
# # Make the distribution of NA's by columns/measurements
# sample_na = rowSums(is.na(data))
# table(sample_na)
# 
# # Make the distribution of NA's by samples
# measurement_na = colSums(is.na(data))
# table(measurement_na)
# 
# #### Make sure dimensions match up #####
# identical(data$Size.in.pixels, data$Object.Area)  ##check if they are identical colnames and the ids if you have multiple tables (subject information)
### Take a look at the distribution of all the variables ###

inf0hist_data <-plot_histogram(inf0, title = "Inf0")

inf180hist_data <-plot_histogram(inf180, title = "inf180")

uninf200hist_data <-plot_histogram(uninf200, title = "uninf200")

### Quantile-Quantile plots help us visualize deviations away from a specific probability distribution, after analyzing these plots we may have to apply transformations as you have mentioned earlier (such as a log) to do some type of linear regression. Default for the plot line is the normal distribution. 

qqplot_inf0 <-plot_qq(inf0, title = "Inf0")

qqplot_inf180 <-plot_qq(inf180, title = "Inf180")

qqplot_uninf200 <-plot_qq(uninf200, title = "Uninf200")

# Slicing and dicing can be crucial to analysis and see where you get variations. If you want to predict an outcome, it is good to look at all the features based on that value. 

# Minint_boxplot_inf0 <- plot_boxplot(inf0, by = 'Minimum.intensity_0')
# Minint_boxplot_inf0 <- plot_boxplot(inf0, by = 'Minimum.intensity_0')
# Minint_boxplot_inf0 <- plot_boxplot(inf0, by = 'Minimum.intensity_0')

PCA

inf0_pca <- drop_columns(inf0, "object_id")
inf0_pca_plot <- plot_prcomp(inf0_pca)
## Warning in plot_prcomp(inf0_pca): The following features are dropped due to zero variance:
##  * cond_infected
##  * time_0
##  * id_inf0

data_pca <- drop_columns(data, c("object_id", "Size.in.pixels"))
data_pca_plot <- plot_prcomp(data)

So I am guessing this is where the real data starts, as in the whole dataset?

you can answer in normal text, I cannot remember what that does in the knit function we will see.

Start EDA

so right now our goal is to simply get a snapshot of the data, see if anything needs to be cleaned, get an idea of the distributions and how they compare with one another. We use what we know about the data to check if the data is picking up what we know is there before we ask questions of the data we do not know the answer to .

Why is this important?

# masterData <- rbind(newData1, newinf180, newuninf200)
# p1 <- ggplot(data = masterData, mapping = aes(x= MaxInt, color = ID)) + geom_histogram(binwidth = 200)+ labs(x = "Max")
# p1
# p2 <- ggplot(data = masterData, mapping = aes(x= MeanInt, color = ID)) + geom_histogram(binwidth = 200)+ labs(x = "Mean")
# p2
# p3 <- ggplot(data = masterData, mapping = aes(x= SumInt, color = ID)) + geom_histogram(bins = 63)+ labs(x = "Sum")
# p3
# p3 <- ggplot(data = masterData, mapping = aes(x= Size, color = ID)) + geom_histogram(bins = 63)+ labs(x = "Size")
# p3
# p4 <- ggplot(data = masterData, mapping = aes(x= Size, y = SumInt ,color = ID)) + geom_point() + labs(x = "Size", y = "Sum of Intensity")
# p4
# p5 <- ggplot(data = newinf180, mapping = aes(x= SumInt, color = ID)) + geom_histogram(bins = 200)+ labs(x = "Sum")
# p5
# p6 <- ggplot(data = newinf180, mapping = aes(x= MaxInt, color = ID)) + geom_histogram(bins = 60)+ labs(x = "Max")
# p6
# p7 <- ggplot(data = masterData, mapping = aes(x= Size, y = StandSum ,color = ID)) + geom_point() + labs(x = "Size", y = "Standardized sum")
# p7
# p8 <- ggplot(masterData, aes(ID,SumInt)) + geom_boxplot() + labs(x = "Sum")
# p8
# p9 <- ggplot(data = masterData, mapping = aes(x= StandMean, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Mean")
# p9
# p10 <- ggplot(data = masterData, mapping = aes(x= StandMax, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Max")
# p10
# p11 <- ggplot(data = masterData, mapping = aes(x= StandSum, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Sum")
# p11
# p12 <- ggplot(data = masterData, mapping = aes(x= PercentIncrease, color = ID)) + geom_histogram(bins = 75)+ labs(x = "Standardized Sum")
# p12

#For control data

# pControl0minscatter <- ggplot(data = newData1, mapping = aes(x= Size, y = SumInt ,color = ID)) + geom_point() + labs(x = "Size", y = "Sum of Intensity")
# pControl0minscatter
# 
# pControl180minscatter <- ggplot(data = newuninf200, mapping = aes(x= Size, y = SumInt ,color = ID)) + geom_point() + labs(x = "Size", y = "Sum of Intensity")
# pControl180minscatter